#Load packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
#Load edited dataset

faculty_salaries <- read_csv("faculty data edited.csv")
## Parsed with column specification:
## cols(
##   Rank = col_character(),
##   Discipline = col_character(),
##   Years_PhD = col_integer(),
##   Years_Service = col_integer(),
##   Sex = col_character(),
##   Salary = col_integer()
## )
#Some initial data exploration to get an early sense of any trends

#Salary by gender
sal_by_gender <- faculty_salaries %>% 
  group_by(Sex) %>% 
  summarize(
    mean = mean(Salary)
  )

#Salary by rank (professor, associate professor, assistant professor)
sal_by_rank <- faculty_salaries %>% 
  group_by(Rank) %>% 
  summarize(
    mean = mean(Salary)
  )

#Average years (since PhD, and of service) for each rank
sal_by_bothyears <- faculty_salaries %>% 
  group_by(Rank) %>% 
  summarize(
    mean_PhD = mean(Years_PhD),
    mean_service = mean(Years_Service)
  )

#Salary by years of service (number of years as faculty), color coded for gender, separated by discipline
sal_by_service <- ggplot(faculty_salaries, aes(x = Years_Service, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5) +
  facet_wrap(~ Discipline)

sal_by_service

#Salary by years since PhD, color coded for gender, separated by discipline
sal_by_years <- ggplot(faculty_salaries, aes(x = Years_PhD, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5) +
  facet_wrap(~ Discipline)

sal_by_years

#Salary by years of service (number of years as faculty), color coded for gender, separated by rank
sal_by_service_rank <- ggplot(faculty_salaries, aes(x = Years_Service, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5) +
  facet_wrap(~ Rank)

sal_by_service_rank

#Salary by years since PhD, color coded for gender, separated by rank
sal_by_years_rank <- ggplot(faculty_salaries, aes(x = Years_PhD, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5) +
  facet_wrap(~ Rank)

sal_by_years_rank

#Mean salary by discipline (A = theoretical, B = applied)
sal_discipline_mean <- faculty_salaries %>% 
  group_by(Discipline) %>% 
  summarize(
    mean = mean(Salary)
  )
#Exploring possibilities of collinearity (focusing on years PhD, years of service)

col_df <- faculty_salaries %>% 
  select(Years_PhD, Years_Service, Salary)

pairs(col_df)

#Years_PhD and Years_Service appear to be highly correlated
#Creating a saturated model for salary, including all variables:

#Rank - professor, associate professor, assistant professor
#Discipline - A = theoretical or B = applied
#Sex - male or female
#Years_PhD - years since getting PhD
#Years_Service - years as a faculty member

sal_lm1 <- lm(Salary ~ Rank + Discipline + Sex + Years_PhD + Years_Service, data = faculty_salaries)

summary(sal_lm1)
## 
## Call:
## lm(formula = Salary ~ Rank + Discipline + Sex + Years_PhD + Years_Service, 
##     data = faculty_salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    78862.8     4990.3  15.803  < 2e-16 ***
## RankAsstProf  -12907.6     4145.3  -3.114  0.00198 ** 
## RankProf       32158.4     3540.6   9.083  < 2e-16 ***
## DisciplineB    14417.6     2342.9   6.154 1.88e-09 ***
## SexMale         4783.5     3858.7   1.240  0.21584    
## Years_PhD        535.1      241.0   2.220  0.02698 *  
## Years_Service   -489.5      211.9  -2.310  0.02143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
#Info from the model summary:
#For rank, the referene level is associate professor
#For sex, the reference level is female
#For discipline, the reference level is A (theoretical)
#Years_PhD and Years_Service are continuous 

#Coefficients for rank seem to make sense - compared to associate professor, a full professor makes a good deal more, and an assistant professor makes less (matches the means that were calculated above)

#Coefficient for sex seems to make sense - male faculty made more

#Coefficient for discipline seems to make sense - applied made more than theoretical

#Are years_PhD and years_service redundant? Do they sort of get at a similar thing? Which one is better? Are they both incorporated within rank?

#Based on overall p-value, the model does significantly predict salary (p < 0.001)
#Another model, this time using years_service and not years_phd

sal_lm2 <- lm(Salary ~ Rank + Discipline + Sex + Years_Service, data = faculty_salaries)

summary(sal_lm2)
## 
## Call:
## lm(formula = Salary ~ Rank + Discipline + Sex + Years_Service, 
##     data = faculty_salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64202 -14255  -1533  10571  99163 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    82912.07    4668.39  17.760  < 2e-16 ***
## RankAsstProf  -14560.40    4098.32  -3.553 0.000428 ***
## RankProf       34599.24    3382.52  10.229  < 2e-16 ***
## DisciplineB    13473.38    2315.50   5.819 1.24e-08 ***
## SexMale         4771.25    3878.00   1.230 0.219311    
## Years_Service    -88.78     111.64  -0.795 0.426958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22650 on 391 degrees of freedom
## Multiple R-squared:  0.4478, Adjusted R-squared:  0.4407 
## F-statistic: 63.41 on 5 and 391 DF,  p-value: < 2.2e-16
#Still weird that there is a negative coefficient for years of service - doesn't make sense!
#What about a model with years since phd, instead of years of service?

sal_lm3 <- lm(Salary ~ Rank + Discipline + Sex + Years_PhD, data = faculty_salaries)

summary(sal_lm3)
## 
## Call:
## lm(formula = Salary ~ Rank + Discipline + Sex + Years_PhD, data = faculty_salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67451 -13860  -1549  10716  97023 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   80988.47    4931.84  16.422  < 2e-16 ***
## RankAsstProf -13104.15    4167.31  -3.145  0.00179 ** 
## RankProf      32928.40    3544.40   9.290  < 2e-16 ***
## DisciplineB   13937.47    2346.53   5.940 6.32e-09 ***
## SexMale        4349.37    3875.39   1.122  0.26242    
## Years_PhD        61.01     127.01   0.480  0.63124    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared:  0.4472, Adjusted R-squared:  0.4401 
## F-statistic: 63.27 on 5 and 391 DF,  p-value: < 2.2e-16
#Positive coefficient for years since PhD, so better!
#Now a model with only rank, discipline, and sex (not using years since PhD or years of service)

sal_lm4 <- lm(Salary ~ Rank + Discipline + Sex, data = faculty_salaries)

summary(sal_lm4)
## 
## Call:
## lm(formula = Salary ~ Rank + Discipline + Sex, data = faculty_salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66268 -14127  -1566  10813  97718 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     81947       4506  18.187  < 2e-16 ***
## RankAsstProf   -13723       3959  -3.466 0.000586 ***
## RankProf        33680       3177  10.600  < 2e-16 ***
## DisciplineB     13709       2295   5.972 5.25e-09 ***
## SexMale          4492       3860   1.164 0.245291    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22640 on 392 degrees of freedom
## Multiple R-squared:  0.4469, Adjusted R-squared:  0.4412 
## F-statistic: 79.18 on 4 and 392 DF,  p-value: < 2.2e-16
#Model diagnostics

#For lm1 (saturated model)

plot(sal_lm1)

#Homoscedasticity seems fine, residuals normality looks pretty good for majority of data (upper tail is a little strange)

plot(sal_lm2)

#Homoscedasticity seems fine, again residuals normality looks pretty good for majority of data (upper tail is a little strange)

plot(sal_lm3)

#Homoscedasticity seems fine, again residuals normality looks pretty good for majority of data (upper tail is a little strange)

plot(sal_lm4)

#Homoscedasticity seems fine, again residuals normality looks pretty good for majority of data (upper tail is a little strange)

#For the q-q, upper tail could be influenced that as rank/years get high, salaries start to increase non-linearly
#VIF for each model

vif(sal_lm1)
##                   GVIF Df GVIF^(1/(2*Df))
## Rank          2.013193  2        1.191163
## Discipline    1.064105  1        1.031555
## Sex           1.030805  1        1.015285
## Years_PhD     7.518936  1        2.742068
## Years_Service 5.923038  1        2.433729
#Years_PhD and Years_Service both have VIF values > 4 (7.5, 5.9) - remove one of them from the model? Remove both - is this incorporated into rank?

vif(sal_lm2)
##                   GVIF Df GVIF^(1/(2*Df))
## Rank          1.597441  2        1.124233
## Discipline    1.029040  1        1.014416
## Sex           1.030803  1        1.015284
## Years_Service 1.627110  1        1.275582
vif(sal_lm3)
##                GVIF Df GVIF^(1/(2*Df))
## Rank       1.987205  2        1.187301
## Discipline 1.055727  1        1.027486
## Sex        1.028359  1        1.014080
## Years_PhD  2.065517  1        1.437191
vif(sal_lm4)
##                GVIF Df GVIF^(1/(2*Df))
## Rank       1.034437  2        1.008500
## Discipline 1.012236  1        1.006099
## Sex        1.022339  1        1.011108
#No other VIF values were higher than 4. Also, with the exception of Years_PhD in model 2, all were very close to one (and in that instance, the value was 2)

#AIC models comparison

lm1_aic <- AIC(sal_lm1) #9093.82
lm2_aic <- AIC(sal_lm2) #9096.81
lm3_aic <- AIC(sal_lm3) #9097.21
lm4_aic <- AIC(sal_lm4) #9095.45
#What about trying two models with interaction terms 

sal_lm5 <- lm(Salary ~ Rank + Sex + Discipline + Years_Service + Discipline*Years_Service, data = faculty_salaries)

summary(sal_lm5)
## 
## Call:
## lm(formula = Salary ~ Rank + Sex + Discipline + Years_Service + 
##     Discipline * Years_Service, data = faculty_salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70632 -14191  -2098   9937  94331 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                86338.5     4880.9  17.689  < 2e-16 ***
## RankAsstProf              -13899.7     4086.8  -3.401 0.000741 ***
## RankProf                   34121.8     3371.0  10.122  < 2e-16 ***
## SexMale                     5196.1     3861.9   1.345 0.179254    
## DisciplineB                 6255.7     3916.2   1.597 0.110989    
## Years_Service               -266.8      135.8  -1.965 0.050128 .  
## DisciplineB:Years_Service    406.3      178.3   2.279 0.023221 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22530 on 390 degrees of freedom
## Multiple R-squared:  0.455,  Adjusted R-squared:  0.4467 
## F-statistic: 54.27 on 6 and 390 DF,  p-value: < 2.2e-16
#What about taking out rank and leaving in one of the years?

sal_lm6 <- lm(Salary ~ Discipline + Sex + Years_Service, data = faculty_salaries)

summary(sal_lm6)
## 
## Call:
## lm(formula = Salary ~ Discipline + Sex + Years_Service, data = faculty_salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77424 -19404  -4635  15539 105391 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    84361.1     4941.4  17.072  < 2e-16 ***
## DisciplineB    13033.8     2840.3   4.589 6.01e-06 ***
## SexMale         8423.3     4744.5   1.775   0.0766 .  
## Years_Service    832.2      110.2   7.550 3.07e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27790 on 393 degrees of freedom
## Multiple R-squared:  0.1646, Adjusted R-squared:  0.1582 
## F-statistic: 25.81 on 3 and 393 DF,  p-value: 2.928e-15
#I think model 4 is the best option, with just rank, discipline, and gender, so I'm making a table for that here. Can be changed easily to another of the models if needed

#First, I think it's nicer to have assistant professor be the reference level

faculty_salaries$Rank <- factor(faculty_salaries$Rank)

faculty_salaries$Rank <- fct_relevel(faculty_salaries$Rank, "AsstProf")


sal_lm4 <- lm(Salary ~ Rank + Discipline + Sex, data = faculty_salaries)

summary(sal_lm4)

Call: lm(formula = Salary ~ Rank + Discipline + Sex, data = faculty_salaries)

Residuals: Min 1Q Median 3Q Max -66268 -14127 -1566 10813 97718

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 68224 4477 15.238 < 2e-16 RankAssocProf 13723 3959 3.466 0.000586 RankProf 47403 3133 15.130 < 2e-16 DisciplineB 13709 2295 5.972 5.25e-09 SexMale 4492 3860 1.164 0.245291
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1

Residual standard error: 22640 on 392 degrees of freedom Multiple R-squared: 0.4469, Adjusted R-squared: 0.4412 F-statistic: 79.18 on 4 and 392 DF, p-value: < 2.2e-16

#Now making the table

sal_lm_table <- stargazer(sal_lm4, type = "html")
Dependent variable:
Salary
RankAssocProf 13,723.420***
(3,959.014)
RankProf 47,403.320***
(3,133.108)
DisciplineB 13,708.690***
(2,295.437)
SexMale 4,491.801
(3,860.240)
Constant 68,223.530***
(4,477.200)
Observations 397
R2 0.447
Adjusted R2 0.441
Residual Std. Error 22,640.990 (df = 392)
F Statistic 79.180*** (df = 4; 392)
Note: p<0.1; p<0.05; p<0.01
sal_lm_table
[1] “”
[2] “ "
[3] “ "
[4] “ "
[5] “ "
[6] “ "
[7] “ "
[8] “ "
[9] “ "
[10] “ "
[11] “ "
[12] “ "
[13] “ "
[14] “ "
[15] “ "
[16] “ "
[17] “ "
[18] “ "
[19] “ "
[20] “ "
[21] “ "
[22] “ "
[23] “ "
[24] “ "
[25] “ " [26] “
Dependent variable:
Salary
RankAssocProf 13,723.420***
(3,959.014)
RankProf 47,403.320***
(3,133.108)
DisciplineB 13,708.690***
(2,295.437)
SexMale 4,491.801
(3,860.240)
Constant 68,223.530***
(4,477.200)
Observations 397
R2 0.447
Adjusted R2 0.441
Residual Std. Error 22,640.990 (df = 392)
F Statistic 79.180*** (df = 4; 392)
Note: p<0.1; p<0.05; p<0.01

"